# RYM Data Analysis
# A Recreation of Several Plots I Made In Excel
library(ggplot2)
library(plotly)
# Make theme_minimal() the default theme for every ggplot built below.
theme_set(theme_minimal())
# READING DATA
# The CSV carries no header row, so every line is read as data.
data <- read.csv("D:\\Downloads\\rymList (3).csv", header = FALSE)
# Discard rows that contain any NA, then assign the column names by hand.
data <- setNames(
  na.omit(data),
  c("Artist", "Album", "Year", "Date", "Rating")
)
# Parse the day/month/year strings in Date into Date objects.
data[["Date"]] <- as.Date(data[["Date"]], format = "%d/%m/%Y")
# Convert a rating string such as "3.50 stars" to a number on a doubled scale.
# Only the first three characters ("x.x") are parsed; the result is that value
# times two, as a double (e.g. "3.50 stars" -> 7). Works element-wise on a
# vector. Input whose first three characters are not numeric yields NA.
convertRating <- function(ratingStr) {
  leading <- substr(ratingStr, 1, 3) # keep just the "x.x" part
  as.double(leading) * 2
}
# Convert the whole Rating column in one vectorised call. substr() and
# as.double() inside convertRating() already work element-wise, so no sapply()
# wrapper is needed; sapply() would also attach the raw strings as names and
# return a list instead of a numeric vector when the column is empty.
data$Rating <- convertRating(data$Rating)
# remove rows with NA rating (not yet rated at time of scraping)
data <- data[!is.na(data$Rating), ]
# plot(data$Date, data$Year,
# col = "#4472C4",
# pch = 19,
# main = "Release Years of Rated Albums vs. Time",
# xlab = "Time",
# ylab = "Release Years of Rated Albums"
# )
# plot(data$Date, data$Rating,
# ylim = c(1, 10),
# col = "#4472C4",
# pch = 19,
# main = "Albums Ratings vs. Time",
# xlab = "Time",
# ylab = "Albums Ratings"
# )
# Scatter plot: release year of each album against the date it was rated.
p1 <- ggplot(data = data, mapping = aes(x = Date, y = Year)) +
  geom_point(colour = "#4472C4") +
  labs(
    title = "Date Rated / Album Year",
    x = "Date Rated",
    y = "Year of Release"
  )
# Render the static plot as an interactive plotly widget.
ggplotly(p1)
# Scatter plot: rating of each album against the date it was rated.
# `text` is a dummy aesthetic that the point layer does not draw; it is there
# so ggplotly() can show album and artist in the hover tooltip.
# ggplot() takes a single mapping, so everything is declared in one aes().
p2 <- ggplot(
  data,
  aes(
    x = Date,
    y = Rating,
    group = 1,
    text = paste(
      "Album: ", Album,
      "</br>Artist: ", Artist
    )
  )
) +
  geom_point(color = "#4472C4") +
  labs(
    title = "Date Rated / Album Rating",
    x = "Date Rated",
    y = "Rating"
  )
# Render the static plot as an interactive plotly widget.
ggplotly(p2)
# Time Series Plots
# reverse the row order and save the result in a new data frame
# (rev(seq_len()) instead of nrow(data):1 so a zero-row table stays empty)
df <- data[rev(seq_len(nrow(data))), ]
# renumber the row names 1..n to follow the new order
row.names(df) <- seq_len(nrow(df))
# give it its own name so it appears that way in interactive hover
index <- as.numeric(row.names(df))
# plot list index against time
# dummy aesthetic: https://stackoverflow.com/a/43763132
# `text` only feeds the hover tooltip; `group = 1` keeps every row in a single
# group so geom_line() draws one connected line.
# ggplot() takes a single mapping, so everything is declared in one aes().
tplot <- ggplot(
  data = df,
  aes(
    x = Date,
    y = index,
    color = Rating,
    group = 1,
    text = paste(
      "Album: ", Album,
      "</br>Artist: ", Artist
    )
  )
) +
  geom_line(color = "#00AFBB", size = 1, linetype = "dashed") +
  geom_point(size = 0.5) +
  labs(
    title = "Total number of albums rated / Time",
    x = "Time",
    y = "Total number of albums rated"
  )
# Limit the hover tooltip to the x value, the y value and the custom text.
ggplotly(
  p = tplot,
  tooltip = c("x", "y", "text")
)
# Histogram
# Histogram of ratings: bins one unit wide, axis ticks at every integer 1-10.
p <- ggplot(data = df, mapping = aes(x = Rating)) +
  geom_histogram(
    binwidth = 1,
    colour = "darkblue",
    fill = "lightblue"
  ) +
  scale_x_continuous(breaks = seq(from = 1, to = 10, by = 1))
# Render the static plot as an interactive plotly widget.
ggplotly(p)
# Histogram of release years: one bin per year, axis ticks every ten years
# starting from the decade that contains the earliest year in df.
rankplot <- ggplot(data = df, mapping = aes(x = Year)) +
  geom_histogram(
    binwidth = 1,
    colour = "darkblue",
    fill = "lightblue"
  ) +
  scale_x_continuous(
    breaks = seq(
      from = floor(min(df$Year) / 10) * 10,
      to = max(df$Year),
      by = 10
    )
  )
# coord_flip()
# Render the static plot as an interactive plotly widget.
ggplotly(rankplot)
# Scatterplot
# Build a per-year table of album counts; cbind(count = Album) makes the
# aggregated column come out named "count".
df2 <- aggregate(
  cbind(count = Album) ~ Year,
  data = df,
  FUN = function(x) NROW(x)
)
# record each row's position in the table as returned by aggregate()
df2$Index <- seq_len(nrow(df2))
# reorder the rows by ascending album count
df2 <- df2[order(df2$count), ]
# reset the row names to 1..n and read them back as the rank by count
row.names(df2) <- seq_len(nrow(df2))
ranking <- as.numeric(row.names(df2))
# Two point series over Year: the original position (blue) and the
# rank by count (orange); axis ticks every ten years.
yearplot <- ggplot(data = df2, mapping = aes(x = Year)) +
  geom_point(aes(y = Index), color = "#4472C4") +
  geom_point(aes(y = ranking), color = "#ED7D31") +
  scale_x_continuous(
    breaks = seq(
      from = floor(min(df$Year) / 10) * 10,
      to = max(df$Year),
      by = 10
    )
  )
# Render the static plot as an interactive plotly widget.
ggplotly(yearplot)